Beta Regression

Code
library(zoib)
library(tidyverse)
library(GGally)
library(ggpubr)
library(kableExtra)

theme_set(theme_pubr(legend = "bottom"))

# read in raw data
nnns0 <- read.csv("../NNNS_score_data.csv")

Using Haojia’s data cleaning:

Code
# clean up variable names
# remove the dots at the end of variable names
colnames(nnns0) <- gsub("\\.+$", "", colnames(nnns0))
# replace all the dots in variable names with underscores
colnames(nnns0) <- gsub("\\.+", "_", colnames(nnns0))

nnns <- nnns0 |> 
  filter(
    # exclude neurologic or airway anomaly
    Neurologic_Complication == 0, AirwayAnomalyYN == 0,
    # include infants from birth to 4 weeks old
    # there are two outliers with age at surgery > 30 days
    Age_at_Surgery_days <= 30
  ) |>
  # some binary variables have values of 1/2 or Y/N, recode them to 0/1
  mutate(
    Female = as.integer(sex_1_M_2_F == 2),
    Premature = as.integer(Premature == 1),
    Extubation_failure = as.integer(Extubation_failure == "Y"),
  ) |>
  
  # relabel cardiac anatomy
  mutate(
    Cardiac_Anatomy = factor(
      Cardiac_Anatomy, levels = 1:4,
      labels = c(
        "Single ventricle w/o arch obstruction",
        "Single ventricle w/ arch obstruction",
        "Two ventricle w/o arch obstruction",
        "Two ventricle w/ arch obstruction"
      )
    ),
    # for model building purposes, combine the 2 levels w/o arch obstruction
    Cardiac_Anatomy_collapsed = fct_collapse(
      Cardiac_Anatomy,
      "W/o arch obstruction" = c("Single ventricle w/o arch obstruction", "Two ventricle w/o arch obstruction"),
      "Single ventricle w/ arch obstruction" = "Single ventricle w/ arch obstruction",
      "Two ventricle w/ arch obstruction" = "Two ventricle w/ arch obstruction"
    )
  ) |>
  
  # convert date variables to date class
  mutate_at(
    vars("Date_PO_feeds_started", "Date_Reaching_Full_PO", "Date_Identified_as_not_yet_full_PO"), 
    as_date, format = "%m/%d/%Y"
  )

# drop unnecessary variables
nnns <- nnns |> select(!c(
  "sex_1_M_2_F", # use Female instead
  "Intubated_Pre_operatively", "bypass_used", "bypass_time_min", # not of interest 
  "Neurologic_Complication", "AirwayAnomalyYN" # already excluded
)) 

Use ZOIB package

ZOIB Model

\[ f(y_i|\eta_i) = \begin{cases} p_i, & y_i =0 \\ (1-p_i)q_i, & y_i =1 \\ (1 − p_i)(1 − q_i)Beta(\alpha_{i_1}, \alpha_{i_2}) &y_i \in (0, 1) \end{cases} \]

\[ \begin{aligned} logit\left(\mu^{(0,1)} = \frac{\alpha_1}{\alpha_1+ \alpha_2}\right) = \mathbf x_1 \boldsymbol \beta_1 + I_1(\mathbf z_{1,i}\gamma_{1,i}) \\ log\left(v_i = \alpha_1+ \alpha_2\right) = \mathbf x_2 \boldsymbol \beta_2 + I_2(\mathbf z_{2,i}\gamma_{2,i}) \\ logit\left(p_i\right) = \mathbf x_3 \boldsymbol \beta_3 + I_1(\mathbf z_{3,i}\gamma_{3,i}) \\ logit\left(q_i\right) = \mathbf x_4 \boldsymbol \beta_4 + I_1(\mathbf z_{4,i}\gamma_{4 ,i}) \end{aligned} \]

Fit ZOIB model

Code
set.seed(11282023) #Set seed, bayesian model!

tictoc::tic()
model1 =zoib(Percent_of_feeds_taken_by_mouth_at_discharge #Response
            ~Pre_Op_NNNS_attention_score+
              Post_Op_NNNS_attention_score+
              Female+Genetic_Syndrome_or_Chromosomal_Abnormality+
              Age_at_Surgery_days+Cardiac_Anatomy+
              Length_of_intubation_days+Length_of_Stay_days+
              Extubation_failure+GI_Complication+Premature
              #x1 design matrix
       | 1 #x2 design matrix- estimating variance
     |Pre_Op_NNNS_attention_score+
              Post_Op_NNNS_attention_score+
              Female+Genetic_Syndrome_or_Chromosomal_Abnormality+
              Age_at_Surgery_days+Cardiac_Anatomy+
              Length_of_intubation_days+Length_of_Stay_days+
              Extubation_failure+GI_Complication+Premature #x3 design matrix
     |Pre_Op_NNNS_attention_score+
              Post_Op_NNNS_attention_score+
              Female+Genetic_Syndrome_or_Chromosomal_Abnormality+
              Age_at_Surgery_days+Cardiac_Anatomy+
              Length_of_intubation_days+Length_of_Stay_days+
              Extubation_failure+GI_Complication+Premature, #x4 design matrix
     data = nnns,
     n.response = 1,
     zero.inflation = TRUE,
     one.inflation = TRUE,
     link.mu = "logit",
     link.x0 = "logit",
     link.x1 =  "logit",
     random = 0,
     n.chain = 4, # number of MCMC chains
     n.iter = 10000, #number of iterations before burn/thin
     n.thin = 10, # thinning period
     n.burn = 5000, # burn-in period 
     seeds = c(11,29,20,23) #vector of seeds for chains to make reproducable
     )
tictoc::toc()
saveRDS(model1, file="model1.RData")

Interpreting output:

b: vector of estimates from Eqn 1; that is, g(mu) = xb*b +z*gamma

d: vector of estimates from Eqn 2; that is, log(eta) = xd*d+z*gamma

b0: vector of estimates from Eqn 3; that is, g(p0) = x0*b0 +z*gamma

b1: vector of estimates from Eqn ;4 that is, g(p1) = x1*b1+z*gamma

Code
model1 =readRDS(file="model1.RData")

model1$coeff |> summary()

Iterations = 1:1000
Thinning interval = 1 
Number of chains = 4 
Sample size per chain = 1000 

1. Empirical mean and standard deviation for each variable,
   plus standard error of the mean:

             Mean        SD Naive SE Time-series SE
b[1]     -8.09465   3.61077 0.057091       0.086664
b[2]      0.16754   0.30532 0.004828       0.006510
b[3]      1.41611   0.54994 0.008695       0.012870
b[4]     -0.77692   0.77662 0.012279       0.017591
b[5]      0.56905   1.07698 0.017029       0.028173
b[6]     -0.04381   0.12693 0.002007       0.004144
b[7]      1.27294   1.10613 0.017489       0.024815
b[8]     -1.49771   0.75022 0.011862       0.016137
b[9]     -1.36771   0.91489 0.014466       0.019695
b[10]     0.12185   0.24224 0.003830       0.006870
b[11]    -0.01628   0.07887 0.001247       0.002793
b[12]    -0.94204   1.89595 0.029978       0.056783
b[13]     0.36259   1.46884 0.023224       0.035131
b[14]     1.20931   1.23171 0.019475       0.026968
b0[1]   -26.42023  19.59217 0.309779       3.034188
b0[2]    -1.90012   1.33031 0.021034       0.033666
b0[3]    -0.17756   1.26060 0.019932       0.025094
b0[4]    -1.31170   1.79427 0.028370       0.033291
b0[5]    -0.56441   2.46492 0.038974       0.050618
b0[6]    -1.66999   0.68567 0.010841       0.022948
b0[7]    34.21280  17.36400 0.274549       3.293759
b0[8]    26.79215  17.24595 0.272682       3.010231
b0[9]    33.87990  17.29435 0.273448       3.139413
b0[10]    1.02017   0.67087 0.010607       0.015519
b0[11]    0.36265   0.16344 0.002584       0.005151
b0[12]  -11.40039   5.46852 0.086465       0.168737
b0[13]   -6.40822   4.21461 0.066639       0.105051
b0[14]   -1.27472   3.02047 0.047758       0.056843
b1[1]  -237.08487 121.44880 1.920274       8.186235
b1[2]    10.25191  13.30420 0.210358       0.890090
b1[3]    20.56138  23.44149 0.370643       1.755172
b1[4]    33.35014  29.12542 0.460513       2.346221
b1[5]   -64.26438  43.41481 0.686448       3.460382
b1[6]     5.79882   3.94782 0.062421       0.270918
b1[7]    25.88272  41.17875 0.651093       3.887651
b1[8]   -25.73807  46.95655 0.742448       4.039620
b1[9]    49.72144  32.03913 0.506583       2.605644
b1[10]    2.79552   9.38518 0.148393       0.820767
b1[11]   -0.62349   1.50991 0.023874       0.127070
b1[12]  -12.46243  67.82066 1.072339       5.711628
b1[13]  -66.75614  66.37378 1.049462       5.622294
b1[14]  -16.30196  68.66405 1.085674       6.924297
d         1.69038   0.42303 0.006689       0.008758

2. Quantiles for each variable:

            2.5%        25%        50%        75%     97.5%
b[1]    -14.8474  -10.48537   -8.11701   -5.76482  -0.71873
b[2]     -0.4539   -0.03206    0.17530    0.37322   0.73392
b[3]      0.3054    1.06533    1.43189    1.78935   2.49005
b[4]     -2.2652   -1.30988   -0.78185   -0.26866   0.74620
b[5]     -1.5229   -0.13971    0.57724    1.28133   2.65090
b[6]     -0.2974   -0.12447   -0.04508    0.03760   0.20562
b[7]     -0.8719    0.54888    1.24323    1.96193   3.54529
b[8]     -2.8418   -2.01442   -1.53797   -1.02424   0.06508
b[9]     -3.0918   -2.00869   -1.40385   -0.76150   0.46411
b[10]    -0.3694   -0.03303    0.12149    0.27391   0.59904
b[11]    -0.1849   -0.06551   -0.01220    0.03614   0.12824
b[12]    -4.4815   -2.17242   -1.04193    0.17430   3.12491
b[13]    -2.4819   -0.62871    0.36894    1.34162   3.31607
b[14]    -1.1930    0.39759    1.18757    2.01486   3.66590
b0[1]   -66.6996  -40.42623  -24.71704  -11.53378   6.47311
b0[2]    -4.7803   -2.73308   -1.81381   -1.00625   0.47801
b0[3]    -2.7282   -0.98068   -0.19594    0.64975   2.38081
b0[4]    -5.0127   -2.44438   -1.30167   -0.13431   2.20775
b0[5]    -5.7475   -2.17587   -0.48875    1.11533   4.00194
b0[6]    -3.1933   -2.07548   -1.59464   -1.17127  -0.51880
b0[7]     7.3892   19.53460   31.89964   47.67642  68.90797
b0[8]     0.5156   12.51126   24.39181   40.07407  61.85394
b0[9]     7.3885   19.65624   31.50292   47.13116  68.88510
b0[10]   -0.1272    0.54728    0.97938    1.44124   2.48895
b0[11]    0.0770    0.24637    0.35128    0.46016   0.72186
b0[12]  -23.2582  -14.76718  -10.99724   -7.55044  -1.96659
b0[13]  -15.2413   -9.04412   -6.08288   -3.46518   1.15036
b0[14]   -7.7286   -3.12670   -1.10839    0.76417   4.10107
b1[1]  -500.0818 -312.38491 -230.56351 -152.76696 -22.78017
b1[2]   -14.6438    1.70753    9.67538   18.62647  38.78106
b1[3]   -24.8474    4.35148   19.84623   36.96641  66.35291
b1[4]   -18.4475   12.97897   31.41279   51.19150  94.14057
b1[5]  -160.4817  -91.65415  -60.79211  -33.70751  10.31647
b1[6]    -1.3629    3.07907    5.60600    8.39559  13.98074
b1[7]   -52.0082   -2.25615   24.29563   54.02553 106.77353
b1[8]  -119.0373  -57.07532  -24.94217    5.00622  65.21001
b1[9]    -7.5283   27.68274   47.62888   69.52931 118.46872
b1[10]  -16.6639   -3.34039    2.90113    8.92324  21.56982
b1[11]   -3.5802   -1.58695   -0.62721    0.37284   2.37157
b1[12] -157.2316  -54.54321   -6.41170   34.65294 106.95123
b1[13] -206.2028 -108.41730  -62.75804  -21.79855  55.90084
b1[14] -161.1771  -59.30652  -18.53089   27.97054 123.19694
d         0.8313    1.41093    1.70051    1.97617   2.48427

Check convergence

Traceplots

Code
traceplot(model1$coeff)

Autocorrelation plots

Code
autocorr.plot(model1$coeff)

Gelman Plot

Code
gelman.diag(model1$coeff)
Potential scale reduction factors:

       Point est. Upper C.I.
b[1]         1.00       1.00
b[2]         1.00       1.00
b[3]         1.00       1.00
b[4]         1.00       1.00
b[5]         1.00       1.01
b[6]         1.00       1.01
b[7]         1.00       1.01
b[8]         1.00       1.01
b[9]         1.00       1.01
b[10]        1.00       1.01
b[11]        1.01       1.02
b[12]        1.01       1.02
b[13]        1.00       1.01
b[14]        1.00       1.00
b0[1]        1.14       1.39
b0[2]        1.00       1.01
b0[3]        1.00       1.01
b0[4]        1.00       1.00
b0[5]        1.00       1.00
b0[6]        1.00       1.00
b0[7]        1.22       1.60
b0[8]        1.22       1.60
b0[9]        1.23       1.62
b0[10]       1.00       1.01
b0[11]       1.00       1.00
b0[12]       1.00       1.00
b0[13]       1.00       1.00
b0[14]       1.00       1.01
b1[1]        1.01       1.01
b1[2]        1.01       1.01
b1[3]        1.01       1.01
b1[4]        1.02       1.07
b1[5]        1.03       1.09
b1[6]        1.04       1.11
b1[7]        1.05       1.12
b1[8]        1.08       1.22
b1[9]        1.02       1.04
b1[10]       1.02       1.07
b1[11]       1.03       1.06
b1[12]       1.04       1.11
b1[13]       1.06       1.18
b1[14]       1.01       1.03
d            1.00       1.00

Multivariate psrf

1.24